goal

Could the gloves serve as a pseudo-blank that includes the sample collection step?

libs/input/output

libraries

library(phyloseq); packageVersion("phyloseq")
## [1] '1.44.0'
# library(Biostrings); packageVersion("Biostrings")
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.2'
library(dplyr); packageVersion("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## [1] '1.1.2'
library(ggrepel); packageVersion("ggrepel")
## [1] '0.9.3'
require(broman); packageVersion("broman") #optional
## Loading required package: broman
## [1] '0.80'

counts data

DADA2_part2 = dir("../..", pattern="DADA2_part2", full.names = T, include.dirs = T)
countsFile = file.path(DADA2_part2, "output", "asv-counts.txt")
counts = read.delim2(countsFile, row.names = 1)
dim(counts)
## [1] 3008  952
counts = counts[rowSums(counts) > 0,]
dim(counts)
## [1] 2915  952
head(row.names(counts))
## [1] "ERR1459793" "ERR1459883" "ERR1460576" "ERR1460577" "ERR1460578"
## [6] "ERR1460579"
head(colnames(counts))
## [1] "ASV.1" "ASV.2" "ASV.3" "ASV.4" "ASV.5" "ASV.6"

metadata

metaReduced = dir("../..", pattern="metadata_PRJEB14474", full.names = T, include.dirs = T)
metaFile = file.path(metaReduced, "output", "PRJEB14474_SraRunTable_reduced.txt")
meta = read.delim2(metaFile)
row.names(meta) = meta$ID
message("Read metadata from: ", metaFile)
## Read metadata from: ../../02_review_metadata_PRJEB14474/output/PRJEB14474_SraRunTable_reduced.txt

We will need to describe days as being “pre-opening” or “post-opening”. This information is held is “study_day”. Make a new variable that is a simple category instead of a numeric.

meta$opening = "before opening"
meta$opening[meta$study_day > 0] = "after opening"

We should have meta data for all samples we had data for…and for some samples that have been filtered out. Limit the meta data to only include samples we have in the data.

inData = row.names(counts)
meta = meta[inData, ]
dim(meta)
## [1] 2915   14

taxonomy

assignTax = dir("../..", pattern="assign_ASV_taxonomy", full.names = T, include.dirs = T)
taxaFile = file.path(assignTax, "output/silva/asvTaxaTable_silva-v138.txt")
taxa = read.delim2(taxaFile, row.names = "asvID")
# drop the ASV column
taxaLim = taxa[,grep("ASV", names(taxa), invert = T, value = T)]
taxaMat = as.matrix(taxaLim)

direct output

# tmpDir = "../temp"
# suppressWarnings(dir.create(tmpDir, recursive = T))
# 
outDir = "../output"
suppressWarnings(dir.create(outDir, recursive = T))

set seed

The seed each time that the dirction of the pcoa matters; like times that we add annotations or comments with hard-coded values based on the pcoa.

seed = 22
set.seed(seed)

Ordination

colors

Lets lay out a set of colors to use for each type / category to be consistent between plots.

# library(broman)
myProjectPalette = c(
  # match paper figures
  before = "#2E5473",
  'before opening'= "#2E5473",
  Preopening = "#2E5473",
  after = "#BB302F",
  'after opening'= "#BB302F",
  Postopening = "#BB302F",
  # Gloves (with extended categories)
  Glove1 = "deepskyblue",
  Glove2 = "deepskyblue3",
  'Glove from Glove Box' = "deepskyblue1",
  Glove = "deepskyblue1",
  'blank control' = "black", #white? consider: ("#fefe22") brocolors("crayons")["Laser Lemon"]
  # Staff_Surface >> muted blue
  'Personal Cell Phone' = "#a2a2d0", # brocolors("crayons")["Blue Bell"]
  'Hospital Pager' = "#dbd7d2", # brocolors("crayons")["Timberwolf"]
  'Shirt Hem' = "#c5d0e6", # brocolors("crayons")["Periwinkle"]
  Shoe = "#b0b7c6", # brocolors("crayons")["Cadet Blue"]
  # Station_Surface >> purples
  'Countertop' = "#dd4492", # brocolors("crayons")["Cerise"] 
  'Computer Mouse' = "#c364c5", # brocolors("crayons")["Fuchsia"]
  'Station Phone'= "#f664af", # brocolors("crayons")["Magenta"]
  'Chair Armrest' = "#de5d83", # brocolors("crayons")["Blush"]
  'Corridor Floor' = "purple", 
  # (Room_Surface | Station_Surface) > Water Faucet Handle >> green
  'Hot Tap Water Faucet Handle' = "#71bc78", # brocolors("crayons")["Fern"]  
  'Cold Tap Water Faucet Handle'= "#87a96b", # brocolors("crayons")["Asparagus"]
  'Cold Tap Faucet Handle'="#a8e4a0" , # brocolors("crayons")["Granny Smith Apple"]
  # Patient_Skin | Staff_Skin >> organic?
  'Inguinal Fold'="#ffa089" , # brocolors("crayons")["Vivid Tangerine"]
  Hand="#fae7b5" , # brocolors("crayons")["Banana Mania"]
  Nose = "#ffcf48" , # brocolors("crayons")["Sunglow"]
  # Nose="#c5e384" , # brocolors("crayons")["Yellow Green"]
  Axilla="#f78fa7" , # brocolors("crayons")["Pink Sherbert"]
  # Room_Surface >> greenish/blueish
  Floor = "#2b6cc4" , # brocolors("crayons")["Denim"]
  Bedrail = "#aaf0d1" # brocolors("crayons")["Magic Mint"]
)

# to use this with ggplot2, add a scale_*_manual layer:
# plot1 + 
#    scale_colour_manual(values = myProjectPalette) 

ordination function

In the legend, Figure 1A is described as:

  1. Principal coordinate analysis (PCoA) of all floor samples based on weighted UniFrac distance and colored by whether they were taken before or after the hospital’s opening.
# alt syntax: 
# doPCoA = function(counts=counts, meta=meta, taxaMat=taxaMat, sampleType="Floor"){

doPCoA = function(sampleType="Floor", colorBy = "opening"){
  sampleSet = meta[meta$SAMPLE_TYPE %in% sampleType, "ID"]
  countslim = counts[sampleSet,]
  
  ps <- phyloseq(otu_table(counts[sampleSet,], taxa_are_rows=FALSE), 
                 sample_data(meta[sampleSet,]), 
                 tax_table(taxaMat))
  ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
  ord <- ordinate(ps.prop, method="PCoA", distance="bray") # TODO: switch to unifrac to match paper
  po = plot_ordination(ps.prop, ord, shape="surcat", color=colorBy, 
                       title=paste0("bray PCoA - ", paste0(sampleType, collapse=", "))) +
    scale_colour_manual(values = myProjectPalette) 
  return(list(ord=ord, plotA=po))
}

set.seed(seed)
gloves = doPCoA(sampleType="Glove from Glove Box")
gloves$plotA

ordGlove = gloves$ord$vectors[,1:3] %>% data.frame()
ordGlove$ID = row.names(ordGlove)
ordGlove = merge(ordGlove, meta)
row.names(ordGlove) = ordGlove$ID

p1 = ggplot(data.frame(ordGlove), aes(x=Axis.1, y=Axis.2, col=SAMPLE_TYPE, label=ID)) + 
  geom_point() +
  scale_colour_manual(values = myProjectPalette) +
  ggtitle("Unused Glove")
p1

Ax1.cut = 0.08
p1a = p1 + 
  # abline parameters and Ax1.cut were set based on a plot made after set.seed(22)
  geom_abline(slope=1.1, intercept = -0.1, linetype=2, col="gray") +
  geom_vline(xintercept = Ax1.cut)
p1a

p1b = p1a + geom_text_repel(show.legend = F)
p1b
## Warning: ggrepel: 226 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggsave(file.path(outDir, "UnusedGlove_PCOA.png"))
## Saving 7 x 5 in image
## Warning: ggrepel: 226 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Using just axis.1 I can capture most of group2. Only two samples (ERR1461892 and ERR1461651) are left out.

group1 = row.names(ordGlove)[ordGlove[,"Axis.1"] < Ax1.cut]
group2 = row.names(ordGlove)[ordGlove[,"Axis.1"] > Ax1.cut]
ordGlove$group = NA
ordGlove[group1,"group"] = "Glove1"
ordGlove[group2,"group"] = "Glove2"

ggplot(data.frame(ordGlove), aes(x=Axis.1, y=Axis.2, col=group, label=ID)) + 
  geom_point() +
  scale_colour_manual(values = myProjectPalette) +
  ggtitle("Unused Glove (grouped)")

allglove = table(meta$study_day[meta$SAMPLE_TYPE=="Glove from Glove Box"]) %>% data.frame()
allglovedf = data.frame(study_day=allglove$Var1, total=allglove$Freq)
g2glove = table(meta[group2, "study_day"]) %>% data.frame()
names(g2glove) = c("study_day", "g2")
df = merge(allglovedf, g2glove, all=T)
df$g2[is.na(df$g2)] = 0
df$portionG2 = df$g2 / df$total

table(df$portionG2)
## 
##                 0             0.375 0.666666666666667               0.8 
##                95                 1                 1                 1 
##               0.9                 1 
##                 1                14

There are 95 days where all glove samples are in group1, and 13 days where all glove samples are in group2. And 5 days are a mix. The mix days are:

df[df$portionG2 > 0 & df$portionG2 < 1,]
##     study_day total g2 portionG2
## 46         66     5  4 0.8000000
## 97        283     6  4 0.6666667
## 107       311    10  9 0.9000000
## 109       318     8  3 0.3750000

How many glove samples are usually taken on the same day?

table(df$total)
## 
##  1  2  3  4  5  6  8  9 10 
## 39 61  2  1  2  2  2  3  1

This does not distinguish it by room.

all.glove = meta %>% 
  filter(SAMPLE_TYPE == "Glove from Glove Box") %>% 
  count(study_day, Location) %>%
  mutate(total=n) %>% select(-n)

g2.glove = meta %>% 
  filter(ID %in% group2) %>% 
  count(study_day, Location) %>%
  mutate(g2=n) %>% select(-n)

m2 = merge(all.glove, g2.glove, all=T)
m2$g2[is.na(m2$g2)] = 0
m2$portionG2 = as.factor(m2$g2 / m2$total)

p1f = ggplot(m2, aes(x=study_day, y=Location, fill=portionG2)) +
  geom_tile() 
p1f

p1f + xlim(c(0,10))
## Warning: Removed 224 rows containing missing values (`geom_tile()`).

p1f + xlim(c(55,105))
## Warning: Removed 219 rows containing missing values (`geom_tile()`).

p1f + xlim(c(220,270))
## Warning: Removed 223 rows containing missing values (`geom_tile()`).

p1f + xlim(c(275,330))
## Warning: Removed 176 rows containing missing values (`geom_tile()`).

Save images.

pdf(file.path(outDir, "gloveCollectionDays.pdf"))
p1f
p1f + xlim(c(0,10))
## Warning: Removed 224 rows containing missing values (`geom_tile()`).
p1f + xlim(c(55,105))
## Warning: Removed 219 rows containing missing values (`geom_tile()`).
p1f + xlim(c(220,270))
## Warning: Removed 223 rows containing missing values (`geom_tile()`).
p1f + xlim(c(275,330))
## Warning: Removed 176 rows containing missing values (`geom_tile()`).
dev.off()
## quartz_off_screen 
##                 2

Where would my left-out samles fall?

leftOut = c("ERR1461892", "ERR1461651")
meta[leftOut, c("ID", "study_day", "Location")]
##                    ID study_day Location
## ERR1461892 ERR1461892       283   R10015
## ERR1461651 ERR1461651       102   R10016

gloves with others

sampleTypes = unique(meta$SAMPLE_TYPE)

meta[group1,"SAMPLE_TYPE"] = "Glove1"
meta[group2,"SAMPLE_TYPE"] = "Glove2"

eachVsGlove = list()
for (type in sampleTypes){
  eachVsGlove[[type]] = doPCoA(sampleType = c("Glove1", "Glove2", type), colorBy="SAMPLE_TYPE")
  show(eachVsGlove[[type]]$plotA)
}

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] broman_0.80     ggrepel_0.9.3   dplyr_1.1.2     ggplot2_3.4.2  
## [5] phyloseq_1.44.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.3            xfun_0.39               bslib_0.4.2            
##  [4] rhdf5_2.44.0            Biobase_2.60.0          lattice_0.21-8         
##  [7] rhdf5filters_1.12.1     vctrs_0.6.2             tools_4.3.0            
## [10] bitops_1.0-7            generics_0.1.3          biomformat_1.28.0      
## [13] stats4_4.3.0            parallel_4.3.0          tibble_3.2.1           
## [16] fansi_1.0.4             highr_0.10              cluster_2.1.4          
## [19] pkgconfig_2.0.3         Matrix_1.5-4            data.table_1.14.8      
## [22] S4Vectors_0.38.1        lifecycle_1.0.3         GenomeInfoDbData_1.2.10
## [25] farver_2.1.1            compiler_4.3.0          stringr_1.5.0          
## [28] textshaping_0.3.6       Biostrings_2.68.1       munsell_0.5.0          
## [31] codetools_0.2-19        permute_0.9-7           GenomeInfoDb_1.36.0    
## [34] htmltools_0.5.5         sass_0.4.6              RCurl_1.98-1.12        
## [37] yaml_2.3.7              pillar_1.9.0            crayon_1.5.2           
## [40] jquerylib_0.1.4         MASS_7.3-58.4           cachem_1.0.8           
## [43] vegan_2.6-4             iterators_1.0.14        foreach_1.5.2          
## [46] nlme_3.1-162            tidyselect_1.2.0        digest_0.6.31          
## [49] stringi_1.7.12          reshape2_1.4.4          labeling_0.4.2         
## [52] splines_4.3.0           ade4_1.7-22             fastmap_1.1.1          
## [55] grid_4.3.0              colorspace_2.1-0        cli_3.6.1              
## [58] magrittr_2.0.3          survival_3.5-5          utf8_1.2.3             
## [61] ape_5.7-1               withr_2.5.0             scales_1.2.1           
## [64] rmarkdown_2.21          XVector_0.40.0          multtest_2.56.0        
## [67] igraph_1.4.3            ragg_1.2.5              evaluate_0.21          
## [70] knitr_1.42              IRanges_2.34.0          mgcv_1.8-42            
## [73] rlang_1.1.1             Rcpp_1.0.10             glue_1.6.2             
## [76] BiocGenerics_0.46.0     rstudioapi_0.14         jsonlite_1.8.4         
## [79] R6_2.5.1                Rhdf5lib_1.22.0         plyr_1.8.8             
## [82] systemfonts_1.0.4       zlibbioc_1.46.0